import pandas as pd
import numpy as np
6 Pandas
=pd.DataFrame(np.arange(16).reshape(4,4))
Xdf Xdf
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
0] Xdf[
0 0
1 4
2 8
3 12
Name: 0, dtype: int64
0:3] Xdf[
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
1]<10] Xdf[Xdf[
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
<10] Xdf[Xdf
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0.0 | 1.0 | 2.0 | 3.0 |
1 | 4.0 | 5.0 | 6.0 | 7.0 |
2 | 8.0 | 9.0 | NaN | NaN |
3 | NaN | NaN | NaN | NaN |
0,3,2]] Xdf[[
0 | 3 | 2 | |
---|---|---|---|
0 | 0 | 3 | 2 |
1 | 4 | 7 | 6 |
2 | 8 | 11 | 10 |
3 | 12 | 15 | 14 |
0:1] Xdf.loc[
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
0:1] Xdf.iloc[
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
0:1] Xdf[
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
# Exercise 2
=pd.DataFrame(dict(X1=np.random.uniform(0,1,15),
DF=np.random.standard_normal(15),
X2=1))
X3 DF
X1 | X2 | X3 | |
---|---|---|---|
0 | 0.002925 | -0.693743 | 1 |
1 | 0.033349 | 0.774359 | 1 |
2 | 0.879925 | -0.317643 | 1 |
3 | 0.810953 | 0.087859 | 1 |
4 | 0.655664 | 0.336143 | 1 |
5 | 0.748835 | -0.021620 | 1 |
6 | 0.971919 | -0.463584 | 1 |
7 | 0.933795 | 0.763103 | 1 |
8 | 0.898300 | -0.311474 | 1 |
9 | 0.382806 | -0.697514 | 1 |
10 | 0.621569 | 1.838894 | 1 |
11 | 0.693038 | 0.949865 | 1 |
12 | 0.629064 | -0.204435 | 1 |
13 | 0.961133 | 2.522102 | 1 |
14 | 0.370875 | 1.190071 | 1 |
'X1']>0.5)&(DF['X1']<=0.8)].index DF[(DF[
Index([4, 5, 10, 11, 12], dtype='int64')
'X1']>0.5)&(DF['X1']<=0.8)].index DF.loc[(DF[
Index([4, 5, 10, 11, 12], dtype='int64')
'X1']>0.5)&(DF['X1']<=0.8),:].index DF.loc[(DF[
Index([4, 5, 10, 11, 12], dtype='int64')
'X1']>0.5)&(DF['X1']<=0.8))].index DF.iloc[np.array((DF[
Index([4, 5, 10, 11, 12], dtype='int64')
'X3','X2','X1']] DF[[
X3 | X2 | X1 | |
---|---|---|---|
0 | 1 | -0.693743 | 0.002925 |
1 | 1 | 0.774359 | 0.033349 |
2 | 1 | -0.317643 | 0.879925 |
3 | 1 | 0.087859 | 0.810953 |
4 | 1 | 0.336143 | 0.655664 |
5 | 1 | -0.021620 | 0.748835 |
6 | 1 | -0.463584 | 0.971919 |
7 | 1 | 0.763103 | 0.933795 |
8 | 1 | -0.311474 | 0.898300 |
9 | 1 | -0.697514 | 0.382806 |
10 | 1 | 1.838894 | 0.621569 |
11 | 1 | 0.949865 | 0.693038 |
12 | 1 | -0.204435 | 0.629064 |
13 | 1 | 2.522102 | 0.961133 |
14 | 1 | 1.190071 | 0.370875 |
'X2']<0,['X1']] DF.loc[DF[
X1 | |
---|---|
0 | 0.002925 |
2 | 0.879925 |
5 | 0.748835 |
6 | 0.971919 |
8 | 0.898300 |
9 | 0.382806 |
12 | 0.629064 |
6.0.0.1 Input / Output functions in Pandas
# read_csv, read_excel, read_table, read_fwf ...
import os
os.getcwd()
'/Users/hwan/Desktop/Homepage/study_24spring'
'pandas.csv') pd.read_csv(
S.No | Name | Age | City | Salary | |
---|---|---|---|---|---|
0 | 1 | Tom | 28 | Toronto | 20000 |
1 | 2 | Lee | 32 | HongKong | 3000 |
2 | 3 | Steven | 43 | Bay Area | 8300 |
3 | 4 | Ram | 38 | Hyderabad | 3900 |
'pandas.csv',sep=',') pd.read_csv(
S.No | Name | Age | City | Salary | |
---|---|---|---|---|---|
0 | 1 | Tom | 28 | Toronto | 20000 |
1 | 2 | Lee | 32 | HongKong | 3000 |
2 | 3 | Steven | 43 | Bay Area | 8300 |
3 | 4 | Ram | 38 | Hyderabad | 3900 |
'pandas.csv',index_col=0) pd.read_csv(
Name | Age | City | Salary | |
---|---|---|---|---|
S.No | ||||
1 | Tom | 28 | Toronto | 20000 |
2 | Lee | 32 | HongKong | 3000 |
3 | Steven | 43 | Bay Area | 8300 |
4 | Ram | 38 | Hyderabad | 3900 |
'pandas.csv',index_col='Name') pd.read_csv(
S.No | Age | City | Salary | |
---|---|---|---|---|
Name | ||||
Tom | 1 | 28 | Toronto | 20000 |
Lee | 2 | 32 | HongKong | 3000 |
Steven | 3 | 43 | Bay Area | 8300 |
Ram | 4 | 38 | Hyderabad | 3900 |
'pandas.csv',header=None) pd.read_csv(
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | S.No | Name | Age | City | Salary |
1 | 1 | Tom | 28 | Toronto | 20000 |
2 | 2 | Lee | 32 | HongKong | 3000 |
3 | 3 | Steven | 43 | Bay Area | 8300 |
4 | 4 | Ram | 38 | Hyderabad | 3900 |
'pandas.csv',names=[0,1,2,3,4]) pd.read_csv(
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | S.No | Name | Age | City | Salary |
1 | 1 | Tom | 28 | Toronto | 20000 |
2 | 2 | Lee | 32 | HongKong | 3000 |
3 | 3 | Steven | 43 | Bay Area | 8300 |
4 | 4 | Ram | 38 | Hyderabad | 3900 |
'pandas.csv',names=[1,2,3,4]) pd.read_csv(
1 | 2 | 3 | 4 | |
---|---|---|---|---|
S.No | Name | Age | City | Salary |
1 | Tom | 28 | Toronto | 20000 |
2 | Lee | 32 | HongKong | 3000 |
3 | Steven | 43 | Bay Area | 8300 |
4 | Ram | 38 | Hyderabad | 3900 |
'pandas.csv',dtype={'Salary':np.float64}) pd.read_csv(
S.No | Name | Age | City | Salary | |
---|---|---|---|---|---|
0 | 1 | Tom | 28 | Toronto | 20000.0 |
1 | 2 | Lee | 32 | HongKong | 3000.0 |
2 | 3 | Steven | 43 | Bay Area | 8300.0 |
3 | 4 | Ram | 38 | Hyderabad | 3900.0 |
'pandas.csv',names=['a','b','c','d','e']) pd.read_csv(
a | b | c | d | e | |
---|---|---|---|---|---|
0 | S.No | Name | Age | City | Salary |
1 | 1 | Tom | 28 | Toronto | 20000 |
2 | 2 | Lee | 32 | HongKong | 3000 |
3 | 3 | Steven | 43 | Bay Area | 8300 |
4 | 4 | Ram | 38 | Hyderabad | 3900 |
'pandas.csv',names=['a','b','c','d'],header=0,index_col=0) pd.read_csv(
a | b | c | d | |
---|---|---|---|---|
1 | Tom | 28 | Toronto | 20000 |
2 | Lee | 32 | HongKong | 3000 |
3 | Steven | 43 | Bay Area | 8300 |
4 | Ram | 38 | Hyderabad | 3900 |
'pandas.csv',names=['a','b','c','d'],skiprows=1,index_col=0) pd.read_csv(
a | b | c | d | |
---|---|---|---|---|
1 | Tom | 28 | Toronto | 20000 |
2 | Lee | 32 | HongKong | 3000 |
3 | Steven | 43 | Bay Area | 8300 |
4 | Ram | 38 | Hyderabad | 3900 |
'pandas.csv',na_values=['Tom']) pd.read_csv(
S.No | Name | Age | City | Salary | |
---|---|---|---|---|---|
0 | 1 | NaN | 28 | Toronto | 20000 |
1 | 2 | Lee | 32 | HongKong | 3000 |
2 | 3 | Steven | 43 | Bay Area | 8300 |
3 | 4 | Ram | 38 | Hyderabad | 3900 |
6.0.0.2 Binary Operations
= pd.DataFrame(np.random.randint(0, 20, (2, 2)), columns=list('AB'))
A = pd.DataFrame(np.random.randint(0, 10, (3, 3)), columns=list('BAC'))
B print( A, B, sep="\n\n")
A B
0 0 1
1 18 1
B A C
0 1 9 9
1 4 6 2
2 4 1 0
+B A
A | B | C | |
---|---|---|---|
0 | 9.0 | 2.0 | NaN |
1 | 24.0 | 5.0 | NaN |
2 | NaN | NaN | NaN |
=0) A.add(B,fill_value
A | B | C | |
---|---|---|---|
0 | 9.0 | 2.0 | 9.0 |
1 | 24.0 | 5.0 | 2.0 |
2 | 1.0 | 4.0 | 0.0 |
-B.loc[0,:] B
B | A | C | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 3 | -3 | -7 |
2 | 3 | -8 | -9 |
-B.loc[0:0,:] B
B | A | C | |
---|---|---|---|
0 | 0.0 | 0.0 | 0.0 |
1 | NaN | NaN | NaN |
2 | NaN | NaN | NaN |
0,:]) B.sub(B.loc[
B | A | C | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 3 | -3 | -7 |
2 | 3 | -8 | -9 |
'A'],axis=0) B.sub(B[
B | A | C | |
---|---|---|---|
0 | -8 | 0 | 0 |
1 | -2 | 0 | -4 |
2 | 3 | 0 | -1 |
6.0.0.3 Pandas DataFrame Manipulation
import numpy as np
import pandas as pd
= pd.DataFrame(np.random.rand(7, 5))
df df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 0.254541 | 0.022793 | 0.292160 | 0.256052 | 0.742068 |
1 | 0.959650 | 0.210256 | 0.937732 | 0.523141 | 0.660096 |
2 | 0.694372 | 0.674311 | 0.502167 | 0.164087 | 0.527422 |
3 | 0.591313 | 0.952835 | 0.800817 | 0.357809 | 0.029770 |
4 | 0.004058 | 0.930224 | 0.908414 | 0.104988 | 0.372187 |
5 | 0.062482 | 0.480021 | 0.220693 | 0.820707 | 0.578013 |
6 | 0.678025 | 0.481827 | 0.727163 | 0.119174 | 0.252283 |
0,1]] df[[
0 | 1 | |
---|---|---|
0 | 0.254541 | 0.022793 |
1 | 0.959650 | 0.210256 |
2 | 0.694372 | 0.674311 |
3 | 0.591313 | 0.952835 |
4 | 0.004058 | 0.930224 |
5 | 0.062482 | 0.480021 |
6 | 0.678025 | 0.481827 |